Flame Graph
Data
Options ▾
<expr>MemoryTime
profvis({
library("gridExtra") #for saving png files in a specific order into pdf
library("ggplot2")
library('ggdendro')
# library("TSclust")
library("ggmap") #used to plot maps
library("maps")
library(scales) # for function alpha()
library("compiler") # to speed up the computations!
library("plyr")
library("hexbin") #for hexoganal binning
library("rgeos") #for creating maps
library("png") #for reading png files
library("grid") #for arranging png files
library("data.table") #for faster creation of crosstables from data set & for faster searches of datasets; brings about a lot of speed-up! https://github.com/Rdatatable/data.table/wiki/Getting-started
root_path <- "C:/Users/DrosoNeuro/Dropbox/UZH_Master/Masterarbeit/TwitterEpi/ExploratoryAnalysis" # defining root_path containing all relevant documents
#NB!! the BEST WAY TO CHOOSE THE BIN IS probably ***** "FD" *****:
#http://stats.stackexchange.com/questions/798/calculating-optimal-number-of-bins-in-a-histogram-for-n-where-n-ranges-from-30
# # LOADING and MERGING DATA FRAMES ------
#
# setwd(root_path) # setting WD
# #function to make selection of dataframe based on pre_set coordinates
#
# #loading files from sick patients
# #see http://stackoverflow.com/questions/11433432/importing-multiple-csv-files-into-r for explanation about reading several csv-files at once
# setwd("C:/Users/DrosoNeuro/Dropbox/UZH_Master/Masterarbeit/TwitterData/tweets_from_todd/csv_files/sick_csv") # temporarily set WD to folder with files from healthy Twitter users
#
# temp = list.files(pattern="*.csv") #read names of all .csv files
#
# #creates names from csv-files in folder;
# names <- setNames(temp, make.names(gsub("*.csv$", "", temp))) #gsub uses regex to replace the specified patterns within a name
#
# #loading df into environment
# list2env(lapply(names,read.csv, header=FALSE), envir = .GlobalEnv)
#
# #create a list of all the dataframes
# sick_list <- lapply(attr(names,"names"),get)
#
# #combine into a single dataframe
# sick_df <- do.call("rbind",sick_list)
#
# remove(list = attr(names,"names"))#removing single df to save RAM
# remove(sick_list)#removing sick_list to save RAM
#
# colnames(sick_df)=c('userID','longitude','latitude','time','sick','state')
# alarm()
#
# #loading data from healthy Twitter users
# setwd("C:/Users/DrosoNeuro/Dropbox/UZH_Master/Masterarbeit/TwitterData/tweets_from_todd/csv_files/one_hundred_csv") # temporarily set WD to folder with files from healthy Twitter users
# temp = list.files(pattern="*.csv") #read names of all .csv files
#
# #creates names from csv-files in folder;
# names <- setNames(temp, make.names(gsub("*.csv$", "", temp))) #gsub uses regex to replace the specified patterns within a name
#
# #loading df into environment
# list2env(lapply(names,read.csv, header=FALSE), envir = .GlobalEnv)
#
# #create a list of all the dataframes
# healthy_list <- lapply(attr(names,"names"),get)
#
# #combine into a single dataframe
# healthy_df <- do.call("rbind",healthy_list)
#
# remove(list = attr(names,"names"))#removing single df to save RAM
# remove(healthy_list)#removing sick_list to save RAM
# remove(list= c("names","temp"))
#
# colnames(healthy_df)=c('userID','longitude','latitude','time','sick','state')
# alarm()
#
# setwd(root_path) # set WD back
#
# save.image(file="Twitter_Datasets.RData") #saving loaded dataset to prevent loading it from the excel-files the next time
# EXPLORATORY DATA ANALYSIS ------
setwd(root_path) # set WD back
load(file="Twitter_Datasets.RData") #if the code above has been executed once, you can uncomment it and start directly from here
title_plot<-"All tweets from " #generic title plot used in some functions
# to_analyse <- "healthy_df"
# rm(list=setdiff(ls(), to_analyse)) #removes all entries from workspace except for the dataframe that shall be analysed
#
#funtion to make selection of dataframe based on coordinate (lon_west,lon_est,lat_south,lat_north)
coord_selection <- function(dataframe,coord_selec)
{
selec <- dataframe[which(dataframe[,"longitude",]>=coord_selec[1] & dataframe[,"longitude"] <= coord_selec[2] & dataframe[,"latitude"] >= coord_selec[3] & dataframe[,"latitude"] <= coord_selec[4]),]
}
#funtion to make selection of dataframe based on coordinate (lon_west,lon_est,lat_south,lat_north); also returns index
coord_selection2 <- function(dataframe,coord_selec) #
{
selec <- dataframe[which(dataframe[,"longitude",]>=coord_selec[1] & dataframe[,"longitude"] <= coord_selec[2] & dataframe[,"latitude"] >= coord_selec[3] & dataframe[,"latitude"] <= coord_selec[4]),]
index <- which(dataframe[,"longitude",]>=coord_selec[1] & dataframe[,"longitude"] <= coord_selec[2] & dataframe[,"latitude"] >= coord_selec[3] & dataframe[,"latitude"] <= coord_selec[4])
return(list(selec,index))
}
coord_USA <- c(-125,-66,25,50) #select only tweets from mainland USA
sick_df <- coord_selection(sick_df, coord_USA)
healthy_df <- coord_selection(healthy_df,coord_USA)
explore_data <- function(dataframe,sickness_state){ #"sickness_state" takes values "sick" or "healthy" and signifies the state that the users represented in the dataste *should* be in
all_users<-unique(dataframe[,1]) #unique returns a vector, data frame or array like x but with duplicate elements/rows removed; in this case = unique return of user_ID
num_users <- length(all_users)
sick_position <- which(dataframe[,5]==1) #gets position of tweets labelled as asick
num_sick_tweets<-sum(dataframe[,5]==1) #returns number of tweets that are labelled as "sick"
#sick_tweets<-dataframe[sick_position,5] #returns entries that are labelled as sick
# NB! n sick tweets != n sick users!!!
sick_users<-unique(dataframe[sick_position,1])
num_sick_users <- length(sick_users)
#getting healthy
healthy_position <- which(dataframe[,5]==0)
#healthy_tweets <- unique(dataframe[healthy_position,5]) #returns entries that are labelled as healthy
num_healthy_tweets <- sum(dataframe[,5]==0)
healthy_users <- unique(dataframe[healthy_position,1]) #getting healthy users
num_healthy_users <- length(healthy_users)
#check the total number of false labels
if (sickness_state == "sick"){ #checking whether there are any users in a "sick" dataset that have never been sick, i.e. that healthy_users that don't show up in sick_users
false_label <- healthy_users[!(healthy_users %in% sick_users)]
num_false_label <- length(false_label)
}
else if (sickness_state == "healthy"){
false_label <- sick_users
num_false_label <- num_sick_users
}
out <- list(all_users,num_users,sick_position,num_sick_tweets,sick_users,num_sick_users,healthy_position,num_healthy_tweets,healthy_users,num_healthy_users,false_label,num_false_label)
names(out) <- c("all_users","num_users", "sick_position","num_sick_tweets","sick_users","num_sick_users","healthy_position", "num_healthy_tweets", "healthy_users","num_healthy_users","false_label","num_false_label")
return(out)
}
#get preliminary info from datasets
explore_sick <- explore_data(sick_df,"sick")
str(explore_sick)
explore_sick <- list(explore_sick$false_label) #prune list to save memory
names(explore_sick) <- "false_label"
explore_healthy <- explore_data(healthy_df,"healthy")
str(explore_healthy)
explore_healthy <- list(explore_healthy$false_label) #reduce size of list to save memory
names(explore_healthy) <- "false_label"
# ---------------- here we analyse the data `in space' ------------
# # tweets on maps using scatterplots----
#
# plot_location <- function(dataframe,explore,tag)
# {
# dir.create("img_tmp") #create new directory to story images
# setwd(paste0(root_path,"/img_tmp"))
# ###set-up###
# #function print spatial distribution of sick tweets
# print_map <- function(my_map, my_title, my_coord, colo="red",pt_size=0.5){
# ggmap(my_map)+ggtitle(my_title) + geom_point(aes_string(my_coord[,"longitude"],my_coord[,"latitude"]),color=colo,data=my_coord,alpha=.3,size=pt_size)
# }
#
# #prune dataset to save memory
# dataframe <- dataframe[,c(1:3,5)]
#
# #define pt-size
# pt_size_USA <- 1.5
# pt_size_local <- 7.5
#
# #create names for each map
# img_names <- c("sicktweets","sicktweets_local","healthytweets","healthytweets_local","mislabelledtweets","mislabelledtweets_local")
#
# filenames <- paste(img_names,tag,sep="_")
# filenames <- paste(filenames,"png",sep=".")
#
# #define size of each image (in inchex)
# img_size <- 20
#
# ###create maps with tweets for the whole continent###
# my_ggmap<-get_map(location = "Kansas",zoom = 3,source = "google", maptype = "terrain",color = 'bw') # my_ggmap<- map_data('world'); downloading map with focus on Kansas, zoom = 3 (continent)
#
# ##print all tweets labelled as "sick" in the whole USA##
# tot_sick <- dataframe[dataframe[,"sick"]==1,] #subset of all tweets labelled as "sick"
#
# #png(filename=filenames[1],width=2000,height=2000) #open pdf to save images
# p <- print_map(my_ggmap,my_title="Sick tweets",my_coord=tot_sick,colo="red",pt_size=pt_size_USA)
# p <- print
# ggsave(file=filenames[1],width=img_size,height=img_size)
# #dev.off() #close pdf
#
# remove(list=c("p","tot_sick")) #remove map created and dataset to save memory
#
# ##print all tweets labelled as "healthy" in the whole USA##
# tot_healthy <- dataframe[dataframe[,"sick"]==0,]
#
# p <- print_map(my_ggmap,my_title="Healthy tweets",my_coord=tot_healthy,colo="blue",pt_size=pt_size_USA)
# ggsave(file=filenames[3],width=img_size,height=img_size)
#
# remove(list=c("p","tot_healthy")) #remove map created and dataset to save memory
#
# ##print all tweets mislabelled in the whole USA##
# tot_mislabelled <- dataframe[which(dataframe[,"userID"] %in% explore$false_label),] #gets all tweets from those users who are in the wrong category ("sick" users in the "healthy" dataset or "healthy" users in the "sick" data set)
# p <- print_map(my_ggmap,my_title="Mislabelled Tweets",my_coord=tot_mislabelled,col="deeppink",pt_size=pt_size_USA)
# ggsave(file=filenames[5],width=img_size,height=img_size)
#
# remove(list=c("p","tot_mislabelled","my_ggmap")) #remove map created and dataset to save memory; also remove ggmap of USA since it won't be used afterwards
#
# ###create local maps with tweets###
# #create google-map for a certain region
# my_ggmap_local <- get_map(location = "NewYork", zoom= 8, source="google",maptype = "terrain",color = "bw")#local map
# coord_local <- as.numeric(attr(my_ggmap_local,"bb"))#get coordinates of map
# coord_local <- c(coord_local[2],coord_local[4],coord_local[1],coord_local[3]) #reorder coordinates to make it work for function coord_selection
#
# local_selection <- coord_selection2(dataframe,coord_local) #select only those values which are within the local map region
#
# local_index <- local_selection[[2]]
# local_selection <- local_selection[[1]]
# remove(dataframe) #to save memory
#
# ##print all tweets labelled as "sick" in the selected region##
# local_sick <- local_selection[local_selection[,"sick"]==1,] #get all local tweets labelled as "sick"
# #png(filename=filenames[2],width=2000,height=2000)
# p <- print_map(my_ggmap_local,my_title = "Sick tweets", local_sick,"red",pt_size=pt_size_local)
# ggsave(file=filenames[2],width=img_size,height=img_size)
# #dev.off() #close pdf
#
# remove(list=c("p","local_sick")) #remove map created and dataset to save memory;
#
# ##print all tweets labelled as "healthy" in the selected region##
# local_healthy <- local_selection[local_selection[,"sick"]==0,]
#
# p <-print_map(my_ggmap_local,my_title="Healthy tweets",local_healthy,colo="blue",pt_size=pt_size_local)
# ggsave(file=filenames[4],width=img_size,height=img_size)
#
# remove(list=c("p","local_healthy")) #remove map created and dataset to save memory
#
# #print all tweets labelled as "healthy" in the selected region##
# local_mislabelled <- local_selection[which(local_selection[,"userID"] %in% explore$false_label[local_index]),]
#
# p <- print_map(my_ggmap_local,"Mislabelled Tweets",my_coord=local_mislabelled,col="deeppink",pt_size=pt_size_local)
# ggsave(file=filenames[6],width=img_size,height=img_size)
#
# remove(list=c("p","local_mislabelled","my_ggmap_local")) #remove map created and dataset to save memory; also delete local google-map
#
# #reading all created images
# plots <- lapply(ll <- list.files(patt='.*[.]png'),function(x){
# img <- as.raster(readPNG(x))
# grid::rasterGrob(img, interpolate = FALSE)
# })
# #changing wd to root_path and saving images in single pdf
# setwd(paste0(root_path))
# ggsave(paste0("/plots/","scatterplots_",tag,".pdf"), marrangeGrob(grobs=plots, nrow=1, ncol=1))
# unlink("img_tmp",recursive=T) #delete directory with png-images
# }
#
# plot_location(sick_df,explore_sick,"sick_df")
# plot_location(healthy_df,explore_healthy,"healthy_df")
#
#tweets on maps using hexbin plots----
plot_location_hexbin <- function(dataframe,explore,tag) #function print spatial distribution of sick tweets
{
###set-up###
dataframe <- dataframe[,c(1:3,5)] #prune dataset to make it smaller
coord_cont <- c(-125,-66,25,50) #select only tweets from mainland USA
coord_local <- c(-80,-66,38,43) #select only tweets on the East Coast
#set the colors, number intervals, interval location
cr <- colorRampPalette(c("green","blue"))
#number of bins to be use
xbins = 100
#create titles for each map
img_names <- c("alltweets_cont","sicktweets_cont","healthytweets_cont","mislabelled_cont","alltweets_local", "sicktweets_local", "healthytweets_local","mislabelledtweets_local")
filenames <- paste(img_names,tag,sep="_")
# #create a list to store plot; only needed if you want to record plot with recordPlot()
# num.plots <- 8
# my.plots <- vector(num.plots, mode='list')
#create a list to store hexbins
num.plots <- 8
my.bins <- vector(num.plots,mode="list")
# create hexbins for continent data
continent <- coord_selection(dataframe,coord_cont)
my.bins[[1]] <- hexbin(continent$longitude,continent$latitude,xbins=xbins,IDs=T)
temp <- continent[continent[,"sick"]==1,]
my.bins[[2]] <- hexbin(temp$longitude,temp$latitude,xbins=xbins,IDs=T)
temp <- continent[continent[,"sick"]==0,]
my.bins[[3]] <- hexbin(temp$longitude,temp$latitude,xbins=xbins,IDs=T)
temp <- continent[which(continent[,"userID"] %in% explore$false_label),]
my.bins[[4]] <- hexbin(temp$longitude,temp$latitude,xbins=xbins,IDs=T)
remove(continent) #to save memory
# create hexbins for local data
local <- coord_selection(dataframe,coord_local)
my.bins[[5]] <- hexbin(local$longitude,local$latitude,xbins=xbins,IDs=T)
temp <- local[local[,"sick"]==1,]
my.bins[[6]] <- hexbin(temp$longitude,temp$latitude,xbins=xbins,IDs=T)
temp <- local[local[,"sick"]==0,]
my.bins[[7]] <- hexbin(temp$longitude,temp$latitude,xbins=xbins,IDs=T)
temp <- local[which(local[,"userID"] %in% explore$false_label),]
my.bins[[8]] <- hexbin(temp$longitude,temp$latitude,xbins=xbins,IDs=T)
remove(local) #to save memory
###create maps with tweets for the whole continent###
##print all tweets in the whole USA##
pdf(paste0("plots/",'HexbinPlots_',tag,'.pdf'), onefile=TRUE)
for (i in 1:num.plots)
{
my.bins[[i]]@count <- log(my.bins[[i]]@count)#log-transforming counts in order to improve readability
gplot.hexbin(my.bins[[i]],style="colorscale",pen=0,border= 'white', minarea = 0.01, maxarea = 1,colramp=cr,legend=1.5,mincnt=0, xlab="longitude",ylab="latitude",main=filenames[i],colorcut=seq(0,1,length=10))
}
graphics.off()
remove(list=c("my.bins"))
}
plot_location_hexbin(sick_df,explore_sick,"sick_df")
plot_location_hexbin(healthy_df,explore_healthy,"healthy_df")
# histogram of longitude and latitude ----
hist_coord <- function(dataframe, tag,explore){
#create filenames#
filenames <- c("all_tweets","all_tweets","sicktweets","sicktweets","healthytweets","healthytweets","mislabelled","mislabelled")
filenames <- paste(filenames,tag,sep="_")
filenames <- paste(filenames,c("lon","lat"))
#define root for transformation
root <- 1/2
root_tag <- as.character(round(root,2))
#prune dataframe to save memory
dataframe <- dataframe[,c(1:3,5)]
#create a list to store hexbins
num.plots <- 8
my.subsets <- vector(num.plots,mode="list")
# create subsets of dataframe for analysis
my.subsets[[1]] <- dataframe
my.subsets[[2]] <- dataframe[dataframe[,"sick"]==1,]
my.subsets[[3]] <- dataframe[dataframe[,"sick"]==0,]
my.subsets[[4]] <- dataframe[which(dataframe[,"userID"] %in% explore$false_label),]
remove(dataframe) #to save memory
##create histogram of whole dataset##
pdf(file=paste0("plots/","HistogramOfCoordinates_",tag,".pdf"),onefile=T,width=20)
par(mfrow=c(1,2))
for (i in 1:(num.plots/2)){
h <- hist(my.subsets[[1]][,"longitude"], breaks = length(unique(my.subsets[[1]][,"longitude"])),plot=F) #save histdata
h$counts <- (h$counts)**root
plot(h,xlab="longitude",ylab=paste0("(frequency)^",root_tag), main=filenames[(2*i)-1])
remove(list =c("h"))
h <- hist(my.subsets[[1]][,"latitude"], breaks = length(unique(my.subsets[[1]][,"latitude"])),plot=F) #save histdata
h$counts <- (h$counts)**root
plot(h,xlab="latitude",ylab=paste0("(frequency)^",root_tag), main=filenames[(2*i)])
remove(list =c("h"))
my.subsets <- my.subsets[-1] #removing subset that was just used to save memory
}
dev.off()
}
hist_coord(sick_df, "sick_df",explore_sick)
hist_coord(healthy_df,"healthy_df",explore_healthy)
### ---------------- here we analyse the user/state activity ------------
##hist of US states activity ----
#state_abbr = c("dc","as","gu","mp","vi","pr","hi","ak","ct","me","ma","nh","ri","vt","nj","ny","de","md","pa","va","wv","al","fl","ga","ky","ms","nc","sc","tn","il","in","mi","mn","oh","wi","ar","la","nm","ok","tx","ia","ks","mo","ne","co","mt","nd","sd","ut","wy","az","ca","nv","id","or","wa")
#state_names = c("district of columbia","samoa","guam","northern mariana islands","virgin islands","puerto rico","hawaii","alaska","connecticut","maine","massachusetts","new hampshire","rhode island","vermont","new jersey","new york","delaware","maryland","pennsylvania","virginia","west virginia","alabama","florida","georgia","kentucky","mississippi","north carolina","south carolina","tennessee","illinois","indiana","michigan","minnesota","ohio","wisconsin","arkansas","louisiana","new mexico","oklahoma","texas","iowa","kansas","missouri","nebraska","colorado","montana","north dakota","south dakota","utah","wyoming","arizona","california","nevada","idaho","oregon","washington")
# p <- ggplot(data = states_activity, aes(x = states_activity$freq))+
# geom_histogram( ) + ggtitle(paste0(title_plot,tag,'- States activity'))+
# xlab('numb. of tweets') + ylab("num. of states") + theme_bw()+scale_y_discrete()
#function that takes dataframe and plot histogram of tweet activity with regard to state and vice versa
hist_states <- function(dataframe,tag,title_plot = "All tweets from "){
num_states<-length(unique(dataframe[,"state"]))
states_activity<-as.data.frame(table(dataframe[,"state"]))
colnames(states_activity) <- c("state","freq")
filenames <- paste0("plots/","histogram_states_",tag,".pdf")
pdf(file=filenames,width=14)
par(mfrow=c(1,2))
hist(states_activity$freq, breaks = "FD",main= paste0(title_plot,tag,' - States activity'), xlab = 'numb. of tweets', ylab = "num. of states")
barplot(as.array(states_activity$freq),names.arg=states_activity$state,ylab="num. of tweets",xlab = "states",main=paste0(title_plot,tag,' States activity'))
dev.off()
}
hist_states(sick_df, "sick_df")
hist_states(healthy_df,"healthy_df")
## plot histogram of User ID ----
user_activity <- function(dataset,tag){#dataset has to be in the form of a data.table; preferentially with key already set to "userID"
dataset <- data.table(dataset)
setkey(dataset,"userID")
user_ac <- dataset[,.N,by=.(userID)] #".N" is a shortcut for length(current_object), in this case, it outputs the nunber of occurences of each user in the column userID; .() is a shorthand for "list"
filenames <- paste0("plots/","user_activity_",tag,".pdf")
pdf(file=filenames)
activity_plot <- ggplot(data = user_ac, aes(x = user_ac[,N]))+
geom_histogram( ) + ggtitle(paste0('user activity_',tag))+
xlab('numb. of tweets') + ylab("num. of users") + theme_bw()
print(activity_plot)
dev.off()
}
user_activity(sick_df,"sick_df")
user_activity(healthy_df,"healthy_df")
}) #end of profvis
load<GC>[explore_datauniqueplot_location_hexbin[[[<GC>[[which&[hist_coordhist_coord[[[.data.frame[histhist.default.Callhisthist.default.Callhisthist.default.Callhisthist.default.Callhist_statesas.data.frametablefactoras.characterffff020,00040,00060,00080,000100,000120,000140,000160,000180,000